In [1]:
import numpy as np
import pandas as pd

import statsmodels.api as sm
import statsmodels.formula.api as smf

from sklearn.linear_model import LinearRegression

# Visualisation libraries

## Text
from colorama import Fore, Back, Style
from IPython.display import display, Markdown, Latex

## seaborn
import seaborn as sns
sns.set_context("paper", rc={"font.size":12,"axes.titlesize":14,"axes.labelsize":12})
sns.set_style("whitegrid")

## matplotlib
import matplotlib.pyplot as plt
from matplotlib.patches import Ellipse, Polygon
import matplotlib.gridspec as gridspec
plt.style.use('seaborn-whitegrid')
plt.rcParams['axes.labelsize'] = 14
plt.rcParams['xtick.labelsize'] = 12
plt.rcParams['ytick.labelsize'] = 12
plt.rcParams['text.color'] = 'k'
%matplotlib inline

## plotly
from plotly.offline import init_notebook_mode, iplot 
import plotly.graph_objs as go
import plotly.offline as py
from plotly.subplots import make_subplots
import plotly.express as px
# Graphics in retina format 
%config InlineBackend.figure_format = 'retina' 

import warnings
warnings.filterwarnings("ignore")

Linear Regression

The linear regression model is expressed as follows $$Y= \beta_0 + \sum_{j=1}^{p}\beta_{j} X_{j},$$ where $X_j$ represents the j-th predictor and $\beta_j$ quantifies the association between that variable and the response.

In case that $p=2$, $\beta_0$ and $\beta_1$ are known the intercept and slope terms, respectively.

In practice, we cannot identify $\beta_0,~\beta_1,~\ldots,~\beta_p$. Instead, we can have estimates $\hat{\beta}_0,\hat{\beta}_1, \ldots, \hat{\beta}_p$. Given estimates $\hat{\beta}_0$, $\hat{\beta}_1$, $\ldots$, $\hat{\beta}_p$, the following formula can be used for predictions, $$\hat{y}=\hat{\beta}_0 +\hat{\beta}_1x_1 +\hat{\beta}_2 x_2 + \ldots +\hat{\beta}_p x_p.$$

Using the least-squares approach, $\beta_0$, $\beta_1$, $\ldots$, $\beta_p$ can be chosen to minimize the sum of squared residuals

\begin{align} RSS=\sum_{i=1}^{n}(y_i-\hat{y}_i)^2=\sum_{i=1}^{n}(y_i-\hat{\beta}_0 - \hat{\beta}_1 x_{i1} - \hat{\beta}_2 x_{i2} - \hat{\beta}_3 x_{i3}-\ldots - \hat{\beta}_p x_{ip})^2. \end{align}

There are a few definitions that we need throughout this document.

$R^2$ Statistic

The $R^2$ statistic is a measure of the linear relationship between $X$ and $Y$. To calculate $R^2$ , we use the formula $$R^2 =1-\frac{\sum_{i=1}^{n}(y_i − \hat{y}_i) 2}{\sum_{i=1}^{n}(y_i − \bar{y}) 2}$$

Correlation Matrix

It is a matrix in which i-j position defines the correlation between the ith and jth parameter of the given data-set. Defined as correlation $$Cor(X,Y )=\frac{\sum_{i=1}^{n}(x_i − \bar{x})(y_i − \bar{y})}{\sqrt{\sum_{i=1}^{n}(x_i − \bar{x})^2}\sqrt{\sum_{i=1}^{n}(y_i − \bar{y})^2}} $$

Correlation Matrix is basically a covariance matrix. Also known as the auto-covariance matrix, dispersion matrix, variance matrix, or variance-covariance matrix.

Advertising Example

In this section, we work on the ISLR Advertising dataset which consists of the sales of that product in 200 different markets, along with advertising budgets for the product in each of those markets for three different media: TV, radio, and newspaper. The dataset is available at this link.

In [2]:
advertising = pd.read_csv('http://faculty.marshall.usc.edu/gareth-james/ISL/Advertising.csv', index_col = 0)
advertising .columns = [x.title().replace('Tv','TV') for x in advertising.columns]
advertising.head()
Out[2]:
TV Radio Newspaper Sales
1 230.1 37.8 69.2 22.1
2 44.5 39.3 45.1 10.4
3 17.2 45.9 69.3 9.3
4 151.5 41.3 58.5 18.5
5 180.8 10.8 58.4 12.9

Here, $\beta_j$ is interpreted as the average effect on $Y$ of a one unit increase in $X_j$, holding all other predictors fixed. We have $$sales = \beta_0 + \beta_1 × TV + \beta_2 \times radio + \beta_3 \times newspaper + \epsilon. $$

$\beta_0$, $\beta_1$, $\beta_2$ and $\beta_3$

In [3]:
Results = smf.ols('Sales ~ TV + Radio + Newspaper', advertising).fit()
Results.summary().tables[1]
Out[3]:
coef std err t P>|t| [0.025 0.975]
Intercept 2.9389 0.312 9.422 0.000 2.324 3.554
TV 0.0458 0.001 32.809 0.000 0.043 0.049
Radio 0.1885 0.009 21.893 0.000 0.172 0.206
Newspaper -0.0010 0.006 -0.177 0.860 -0.013 0.011

As can be seen,

  • For a fixed amount of TV and newspaper advertising, spending an additional $1,000 on radio advertising can increase sales by approximately 189 units.

  • For a fixed amount of Radio and newspaper advertising, spending an additional $1,000 on TV advertising can increase sales by approximately 46 units.

  • However, the coefficient estimate for the newspaper is very close to zero, and the corresponding p-value is insignificant.

RSS:

In [4]:
# RSS with regression coefficients
RSS = ((advertising.Sales - (Results.params[0] + Results.params[1]*advertising.TV
                             + Results.params[1]*advertising.Radio
                             + Results.params[2]*advertising.Newspaper))**2).sum()/1000
print(Back.CYAN + Fore.BLACK + Style.BRIGHT + 'RSS with regression coefficients' + Style.RESET_ALL + ' = %.4f' % RSS)
del RSS
RSS with regression coefficients = 4.8304

Correlation Matrix and plot:

In [5]:
display(advertising.corr().style.background_gradient(cmap='RdBu').set_precision(2))

def Correlation_Plot (Df,Fig_Size):
    Correlation_Matrix = Df.corr().round(2)
    mask = np.zeros_like(Correlation_Matrix)
    mask[np.triu_indices_from(mask)] = True
    for i in range(len(mask)):
        mask[i,i]=0
    fig, ax = plt.subplots(figsize=(Fig_Size,Fig_Size))
    sns.heatmap(Correlation_Matrix, ax=ax, mask=mask, annot=True, square=True, 
                cmap =sns.color_palette("RdBu", n_colors=10), linewidths = 0.2, vmin=0, vmax=1,
                annot_kws={"size": 14}, cbar_kws={"shrink": .7})
    ax.set_yticklabels(ax.get_yticklabels(), rotation=0)
    
Correlation_Plot (advertising,6)
TV Radio Newspaper Sales
TV 1.00 0.05 0.06 0.78
Radio 0.05 1.00 0.35 0.58
Newspaper 0.06 0.35 1.00 0.23
Sales 0.78 0.58 0.23 1.00

We can see that the correlation between radio and newspaper is about 0.35. In other words, we can see a tendency to spend more on newspaper advertising in markets where more is spent on radio advertising.

Since p-value of the newspaper advertising is insignificant, we can provide a 3D diagram with sale, TV, and Radio advertising data.

$$sales = \beta_0 + \beta_1 × TV + \beta_2 \times radio$$
In [6]:
reg = LinearRegression().fit(advertising[['Radio', 'TV']].values,advertising.Sales)
display(Latex(r'$\beta_0 = %.4f$, $\beta_1 = %.4f$, and $\beta_2 = %.4f$' % (reg.intercept_, reg.coef_[0], reg.coef_[1])))
$\beta_0 = 2.9211$, $\beta_1 = 0.1880$, and $\beta_2 = 0.0458$
In [7]:
fig = go.Figure()
fig.add_trace(go.Scatter3d(x= advertising.Radio, y= advertising.TV, z= advertising.Sales,
                           mode='markers', marker=dict(size=4, color=advertising.Sales, colorscale='Blues', opacity=0.8)))
# Creating a coordinate grid
X, Y = np.meshgrid(np.arange(0, np.ceil(advertising.Radio.max())),
            np.arange(0, np.ceil(advertising.TV.max())), indexing='xy')
Z = np.zeros(X.shape)
for (i,j),v in np.ndenumerate(Z):
        Z[i,j] =(reg.intercept_ + X[i,j]*reg.coef_[0] + Y[i,j]*reg.coef_[1])
fig.add_trace(go.Surface(z=Z, x=X, y=Y, colorscale='Greens', colorbar_len=0.6))
# tight layout
fig.update_layout(title="Regression: Sales ~ Radio + TV Advertising", xaxis_title="Radio", yaxis_title="TV")
fig['layout']['xaxis'].update(range=[0, 50])
fig['layout']['yaxis'].update(range=[0, 300])
fig.show()
del X, Y, Z

Note that we do not have to define our linear model in additive way. For example we can define a linear model that uses radio, TV, and an interaction between the two to predict sales takes the form \begin{align} sales &= \beta_0 + \beta_1 \times TV + \beta_2 \times radio + \beta_3 \times ( radio \times TV ) + \epsilon\\ &= \beta_0 + (\beta_1 + \beta_3 \times radio ) \times TV + \beta_2 \times radio + \epsilon. \end{align}

In [8]:
Results = smf.ols('Sales ~ TV + Radio + TV*Radio', advertising).fit()
Results.summary().tables[1]
Out[8]:
coef std err t P>|t| [0.025 0.975]
Intercept 6.7502 0.248 27.233 0.000 6.261 7.239
TV 0.0191 0.002 12.699 0.000 0.016 0.022
Radio 0.0289 0.009 3.241 0.001 0.011 0.046
TV:Radio 0.0011 5.24e-05 20.727 0.000 0.001 0.001

The results clearly suggest that the model that includes the interaction term is superior to the model that contains only the main effects. Since the p-value for the interaction term, TV x radio, is noticeably low, it is clear that the true relationship is not additive.

Credit Example

The Credit dataset contains the balance (average credit card debt for a number of individuals) as well as several quantitative predictors: age, cards (number of credit cards), education (years of education), income (in thousands of dollars), limit (credit limit), and rating (credit rating).

This dataset can be extracted from the ISLR package using the following syntax.

library (ISLR)
write.csv(Credit, "Credit.csv")
In [9]:
credit = pd.read_csv('Data/Credit.csv', index_col = 0)
credit.head(5)
Out[9]:
ID Income Limit Rating Cards Age Education Gender Student Married Ethnicity Balance
1 1 14.891 3606 283 2 34 11 Male No Yes Caucasian 333
2 2 106.025 6645 483 3 82 15 Female Yes Yes Asian 903
3 3 104.593 7075 514 4 71 11 Male No No Asian 580
4 4 148.924 9504 681 3 36 11 Female No No Asian 964
5 5 55.882 4897 357 2 68 16 Male No Yes Caucasian 331

The Credit data set contains information about balance, age, cards, education, income, limit, and rating for a number of potential customers.

In [10]:
sns.pairplot(credit[['Balance','Age','Cards','Education','Income','Limit','Rating']]);

Each panel of the Figure is a scatterplot for a pair of variables whose identities are given by the corresponding row and column labels.

For example, the scatterplot directly to the right of the word “Balance” depicts balance versus age , while the plot directly to the right of “Age” corresponds to age versus cards . In addition to these quantitative variables, we also have four qualitative variables: gender , student (student status), status (marital status), and ethnicity (Caucasian, African American or Asian).

Predictors with Only Two Levels

We simply create an indicator or dummy variable that takes on two possible dummy variable numerical values (binary values).

For example, based on the gender variable, we can create a new binary variable that takes the form $$x_i =\begin{cases}1, & \mbox{if ith person is female}, \\0, & \mbox{if ith person is male},\end{cases}$$

and the regression equation:

$$y_i = \beta_0 + \beta_1 x i + \epsilon_i =\begin{cases}\beta_0 + \beta_1 + \epsilon_i, & \mbox{if ith person is female}, \\\beta_0 + \epsilon_i , & \mbox{if ith person is male.}\end{cases}$$
In [11]:
Results = smf.ols('Balance ~ C(Gender)', credit).fit()
Results.summary().tables[1]
Out[11]:
coef std err t P>|t| [0.025 0.975]
Intercept 509.8031 33.128 15.389 0.000 444.675 574.931
C(Gender)[T.Female] 19.7331 46.051 0.429 0.669 -70.801 110.267
$$y_{i} = \beta_0 +\beta_1 x_{i1} +\beta_2 x_{i2} +\epsilon_i = \begin{cases} \beta_0 +\beta_1 +\epsilon_i , & \mbox{if ith person is Asian},\\ \beta_0 +\beta_2 +\epsilon_i, & \mbox{ if ith person is Caucasian},\\ \beta_0 +\epsilon_i , & \mbox{if ith person is African American}. \end{cases}$$

In [12]:
Results = smf.ols('Balance ~ C(Ethnicity)', credit).fit()
Results.summary().tables[1]
Out[12]:
coef std err t P>|t| [0.025 0.975]
Intercept 531.0000 46.319 11.464 0.000 439.939 622.061
C(Ethnicity)[T.Asian] -18.6863 65.021 -0.287 0.774 -146.515 109.142
C(Ethnicity)[T.Caucasian] -12.5025 56.681 -0.221 0.826 -123.935 98.930
$$y_i = \beta_0 + \beta_1 x i + \epsilon_i =\begin{cases}\beta_0 + \beta_1 + \epsilon_i, & \mbox{if ith person is Married}, \\\beta_0 + \epsilon_i , & \mbox{if ith person is not Married.}\end{cases}$$
In [13]:
Results = smf.ols('Balance ~ C(Married)', credit).fit()
Results.summary().tables[1]
Out[13]:
coef std err t P>|t| [0.025 0.975]
Intercept 523.2903 36.974 14.153 0.000 450.601 595.980
C(Married)[T.Yes] -5.3475 47.244 -0.113 0.910 -98.227 87.532

There is an interaction term between income and student. We have

\begin{align} \text{balance}_i \approx \beta_0 + \beta_1 \times \text{income}_i + \begin{cases} \beta_2, & \mbox{if ith person is a student},\\0, & \mbox{if ith person is not a student} \\ \end{cases} =\beta_1 \times \text{income}_i + \begin{cases} \beta_0 + \beta_2, & \mbox{if ith person is a student},\\ \beta_0, & \mbox{if ith person is not a student} \\ \end{cases} \end{align}

\begin{align} \text{balance}_i &\approx \beta_0 + \beta_1 \times \text{income}_i + \begin{cases} \beta_2+\beta_3 \times \text{income}_i, & \mbox{if ith person is a student},\\0, & \mbox{if ith person is not a student} \\ \end{cases}\\ &=\begin{cases} (\beta_0+\beta_2)+(\beta_1+\beta_3) \times \text{income}_i, & \mbox{if ith person is a student},\\ \beta_0 + \beta_1 \times \text{income}_i, & \mbox{if ith person is not a student} \\ \end{cases} \end{align}

Regression 1 - without interaction term:

In [14]:
Results1 = smf.ols('Balance ~ Income + Student', credit).fit()
reg1 = Results1.params
Results1.summary().tables[1]
Out[14]:
coef std err t P>|t| [0.025 0.975]
Intercept 211.1430 32.457 6.505 0.000 147.333 274.952
Student[T.Yes] 382.6705 65.311 5.859 0.000 254.272 511.069
Income 5.9843 0.557 10.751 0.000 4.890 7.079

Regression 2 - with interaction term:

In [15]:
Results2 = smf.ols('Balance ~ Income + Income*Student', credit).fit()
reg2 = Results2.params
Results2.summary().tables[1]
Out[15]:
coef std err t P>|t| [0.025 0.975]
Intercept 200.6232 33.698 5.953 0.000 134.373 266.873
Student[T.Yes] 476.6758 104.351 4.568 0.000 271.524 681.827
Income 6.2182 0.592 10.502 0.000 5.054 7.382
Income:Student[T.Yes] -1.9992 1.731 -1.155 0.249 -5.403 1.404
In [16]:
fig = go.Figure()
fig = make_subplots(rows=1, cols=2)

# x-axis (Income)
Max_Income = np.ceil(credit['Income'].max())
income = np.linspace(0, Max_Income)

# y-axis (Balance without interaction term)
st1 = np.linspace(reg1['Intercept']+reg1['Student[T.Yes]'],
                  reg1['Intercept']+reg1['Student[T.Yes]'] + Max_Income*reg1['Income'])
non_st1 =  np.linspace(reg1['Intercept'], reg1['Intercept'] + Max_Income*reg1['Income'])

# y-axis (Balance with iteraction term)
st2 = np.linspace(reg2['Intercept']+reg2['Student[T.Yes]'],
                         reg2['Intercept']+reg2['Student[T.Yes]']+Max_Income*(reg2['Income']+reg2['Income:Student[T.Yes]']))
non_st2 =  np.linspace(reg2['Intercept'], reg2['Intercept']+Max_Income*reg2['Income'])

# Left, 
fig.add_trace(go.Scatter(x= income, y= st1, line=dict(color='OrangeRed', width= 1.5), 
                         name = 'Student'), 1, 1)
fig.add_trace(go.Scatter(x= income, y= non_st1, line=dict(color='MidnightBlue', width= 1.5), 
                         name = 'Non-Student'), 1, 1)
# right
fig.add_trace(go.Scatter(x= income, y= st2, line=dict(color='OrangeRed', width= 1.5), 
                         name = 'Student', showlegend = False), 1, 2)
fig.add_trace(go.Scatter(x= income, y= non_st2, line=dict(color='MidnightBlue', width= 1.5), 
                         name = 'Non-Student', showlegend = False), 1, 2)

# Update
fig.update_xaxes(title_text='Income', range=[0, 2e2], row=1, col=1)
fig.update_xaxes(title_text='Income', range=[0, 2e2], row=1, col=2)
fig.update_yaxes(title_text='Balance', range=[0, 2e3], row=1, col=1)
fig.update_yaxes(title_text='Balance', range=[0, 2e3], row=1, col=2)
fig.update_layout(title_text="Balance with iteraction term", legend_title_text='Status')
fig.show()

Refrences